The goal of this document is to look at residuals and see if we could use the residuals to look at some sort of adjusted “counts” (residuals can be negative, so we should probably find another name than “counts”). First, I fitted zinbwave without the batches in the X, then I included the batches in the X. For each scenario, I looked at naive, naive on the log scale, Pearson, and deviance residuals. I plotted the PCA of the residuals with two coloring (batch and clusters) and boxplots of the residuals for each cell.
I am still using the data Davide gave me a while ago. And I think now it is not the same data Davide is using in its most recent vignette (that is zinb_clustering_over_k.Rmd). It should not be very important but it would be nice if we have a common dataset we can work on.
load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")
de = read.csv('../data/oe_markers.txt', stringsAsFactors = F, header = F)
de = de$V1
Here we only look at all the genes.
names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
#vars <- rowVars(log1p(counts))
#names(vars) <- rownames(counts)
#vars <- sort(vars, decreasing = TRUE)
#core <- counts[names(vars)[1:1000],]
core = counts
We have 616 cells. To speed up the computations, we will focus on the top 1,000 most variable genes.
dim(core)
## [1] 12781 616
core[1:3, 1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER 1 3022 2329
## Xkr4 0 0 0
## LOC102640625 1 9 2
Cells have been processed in 18 different batches
table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B P01 P02 P03A P03B P04 P05
## 36 33 30 17 25 43 37 33 14 20
## P06 P10 P11 P12 P13 P14 Y01 Y04
## 39 36 44 40 47 39 51 32
Cells have been clustered into 13 different clusters
table(clus.labels)
## clus.labels
## 1 2 3 4 5 7 8 9 10 11 12 14 15
## 91 25 56 40 96 60 28 79 26 22 35 26 32
Batches are kind of confounded with the biology
table(data.frame(batch = as.vector(batch),
cluster = clus.labels))
## cluster
## batch 1 2 3 4 5 7 8 9 10 11 12 14 15
## GBC08A 0 2 12 9 0 0 0 0 0 2 0 2 9
## GBC08B 0 7 5 3 0 0 0 1 2 4 0 5 6
## GBC09A 0 1 5 9 0 0 0 1 1 0 0 6 7
## GBC09B 0 2 2 7 0 0 0 3 0 0 0 3 0
## P01 0 2 4 3 15 1 0 0 0 0 0 0 0
## P02 2 0 9 3 15 3 3 2 3 0 2 1 0
## P03A 3 0 3 0 12 2 9 4 2 0 2 0 0
## P03B 1 2 1 1 11 1 2 10 1 1 2 0 0
## P04 0 0 0 0 11 1 0 1 1 0 0 0 0
## P05 0 0 0 1 11 3 0 1 0 2 2 0 0
## P06 1 2 3 0 8 2 4 8 4 1 2 2 2
## P10 3 1 4 0 4 5 9 2 0 2 5 0 1
## P11 2 1 1 0 1 5 1 22 3 1 6 0 1
## P12 0 2 0 0 4 10 0 8 2 3 6 4 1
## P13 1 2 4 0 4 15 0 4 5 6 1 3 2
## P14 0 0 1 2 0 12 0 12 2 0 7 0 3
## Y01 47 1 1 2 0 0 0 0 0 0 0 0 0
## Y04 31 0 1 0 0 0 0 0 0 0 0 0 0
plotBoxplot <- function(y, main = '', col, log = F){
if (log) y = log2(y + 1)
boxplot(y, main=main, col=col, staplewex=0, outline=0,
border=col, xaxt='n')
abline(h=0)
}
compute.naive.residuals <- function(Y, zinb){
mu_hat = t(getMu(zinb))
pi_hat = t(getPi(zinb))
Y_hat = (1 - pi_hat) * mu_hat
Y - Y_hat
}
compute.log.naive.residuals <- function(Y, zinb){
mu = t(getMu(zinb))
pi = t(getPi(zinb))
phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
Y_hat = (1 - pi) * mu
log_Y_hat_plus_1 = log(1 + Y_hat) - var_hat / (2*(1 + Y_hat)^2)
log(Y + 1) - log_Y_hat_plus_1
}
compute.pearson.residuals <- function(Y, zinb){
num = compute.naive.residuals(Y, zinb)
mu = t(getMu(zinb))
pi = t(getPi(zinb))
phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
num / sqrt(var_hat)
}
compute.zinb.loglik <- function(Y, zinb){
mu = t(getMu(zinb))
theta = getTheta(zinb)
theta_mat = matrix(rep(theta, ncol(Y), ncol = ncol(Y)))
pi = t(getPi(zinb))
log( pi * (Y == 0) + (1 - pi) * dnbinom(Y, size = theta, mu = mu) )
}
compute.deviance.residuals <- function(Y, zinb){
mu_hat = t(getMu(zinb))
pi_hat = t(getPi(zinb))
Y_hat = (1 - pi_hat) * mu_hat
ll = compute.zinb.loglik(Y, zinb)
sign = 1*(Y - Y_hat > 0)
sign[sign == 0] = -1
sign * sqrt(-2 * ll)
}
Let’s run zinbwave with K = 0 and X with an intercept only (no batch).
print(system.time(zinb <- zinbFit(core, ncores = NCORES, K = 0)))
## user system elapsed
## 1827.539 38.656 532.069
Rn = compute.naive.residuals(core, zinb)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -1054.182261 2094.19514029 1668.865489
## Xkr4 -13.532615 -11.20593481 -7.164258
## LOC102640625 -8.996483 0.03197009 -4.641870
PCA (centered not scale) of residuals, first colored by batch (left), then by clusters (right). On the left side, we should not see patterns whereas on the right hand side we should.
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
de_1000 = de[de %in% rownames(Rn)]
length(de_1000)
## [1] 115
head(de_1000)
## [1] "Ascl1" "Ccnd1" "Kit" "Lgr5" "Neurod1" "Neurog1"
Each boxplot is for a cell. Colors correspond to the batches. When colors are the same but boxplots are not next to each others it corresponds to different batches. We would expect that the residuals look similar for the different batches.
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
breaks = setBreaks(Rn[de_1000, ], breaks = 0.99)
heatmap.2(Rn[de_1000, ], breaks = breaks,
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Naive Residuals')
Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -3.8809820 3.626390 3.8244528
## Xkr4 10.6931681 11.616382 12.9172753
## LOC102640625 -0.1836331 1.497418 0.4742386
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rn[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Log naive residuals')
Rp = compute.pearson.residuals(core, zinb)
Rp[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -0.4566869 1.019352273 1.1145278
## Xkr4 -0.1800795 -0.172770975 -0.1601206
## LOC102640625 -0.4691034 0.001855295 -0.3618107
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
breaks = setBreaks(Rp[de_1000, ], breaks = 0.99)
heatmap.2(Rp[de_1000, ],breaks = breaks,
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Pearson residuals')
Rd = compute.deviance.residuals(core, zinb)
Rd[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -2.7838750 4.525812 4.4807724
## Xkr4 -0.4961831 -0.473638 -0.4331959
## LOC102640625 -2.1494309 2.835279 -2.3243557
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rd[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Deviance residuals')
Let’s run zinbwave with K = 0 and X with an intercept only (no batch).
mod = model.matrix( ~ batch)
print(system.time(zinb_batch <- zinbFit(core, ncores = NCORES, K = 0,
X = mod)))
## user system elapsed
## 3300.844 55.424 715.158
Rn = compute.naive.residuals(core, zinb_batch)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -2156.802509 1023.810053 891.511804
## Xkr4 -5.206859 -4.448180 -2.818703
## LOC102640625 -3.536348 4.783501 -1.053260
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
breaks = setBreaks(Rn[de_1000, ], breaks = 0.99)
heatmap.2(Rn[de_1000, ], breaks = breaks,
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Naive Residuals')
Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -3.8809820 3.626390 3.8244528
## Xkr4 10.6931681 11.616382 12.9172753
## LOC102640625 -0.1836331 1.497418 0.4742386
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rn[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Log naive residuals')
Rp = compute.pearson.residuals(core, zinb_batch)
Rp[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -0.5703063 0.2916186 0.3514468
## Xkr4 -0.2181257 -0.2086423 -0.1942887
## LOC102640625 -0.4392369 0.6375194 -0.1911441
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
breaks = setBreaks(Rp[de_1000, ], breaks = 0.99)
heatmap.2(Rp[de_1000, ],breaks = breaks,
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Pearson residuals')
Rd = compute.deviance.residuals(core, zinb_batch)
Rd[1:3,1:3]
## OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER -2.8670993 4.4156535 4.3602138
## Xkr4 -0.5505718 -0.5241628 -0.4794994
## LOC102640625 -2.0274327 2.8304297 -2.2227802
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
main = "PCA of residuals\ncolor=cluster")
par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
col = col_order)
heatmap.2(Rd[de_1000, ],
col = colorRampPalette(c('blue', 'yellow'))(51),
scale = 'none', trace = 'none',
ColSideColors = as.character(clus.labels),
labCol = '', main = 'Deviance residuals')
As batches are somehow confounded with the biology, I don’t know if we should adjust or not for the batches. For the residuals, it seems that the Pearson residuals when not adjusting for batches is the most informative (at least, when looking at PC 1 and 2).
sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
##
## locale:
## [1] LC_CTYPE=ja_JP.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=ja_JP.UTF-8 LC_COLLATE=ja_JP.UTF-8
## [5] LC_MONETARY=ja_JP.UTF-8 LC_MESSAGES=ja_JP.UTF-8
## [7] LC_PAPER=ja_JP.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=ja_JP.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 methods stats graphics grDevices utils
## [8] datasets base
##
## other attached packages:
## [1] clusterExperiment_1.2.0 gplots_3.0.1
## [3] matrixStats_0.52.2 zinbwave_0.99.1
## [5] SummarizedExperiment_1.4.0 Biobase_2.34.0
## [7] GenomicRanges_1.26.4 GenomeInfoDb_1.10.3
## [9] IRanges_2.8.2 S4Vectors_0.12.2
## [11] BiocGenerics_0.20.0 knitr_1.16
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-131 bitops_1.0-6 bold_0.4.0
## [4] progress_1.1.2 doParallel_1.0.10 RColorBrewer_1.1-2
## [7] httr_1.2.1 rprojroot_1.2 prabclus_2.2-6
## [10] numDeriv_2016.8-1 tools_3.3.3 backports_1.1.0
## [13] R6_2.2.1 KernSmooth_2.23-15 DBI_0.6-1
## [16] lazyeval_0.2.0 colorspace_1.3-2 ade4_1.7-6
## [19] trimcluster_0.1-2 nnet_7.3-12 prettyunits_1.0.2
## [22] gridExtra_2.2.1 glmnet_2.0-10 pspline_1.0-17
## [25] xml2_1.1.1 pkgmaker_0.22 diptest_0.75-7
## [28] caTools_1.17.1 scales_0.4.1 DEoptimR_1.0-8
## [31] mvtnorm_1.0-6 robustbase_0.92-7 NMF_0.20.6
## [34] stringr_1.2.0 digest_0.6.12 rmarkdown_1.5
## [37] XVector_0.14.1 htmltools_0.3.6 stabledist_0.7-1
## [40] ADGofTest_0.3 limma_3.30.13 rlang_0.1.1
## [43] howmany_0.3-1 jsonlite_1.4 mclust_5.3
## [46] gtools_3.5.0 dplyr_0.5.0 dendextend_1.5.2
## [49] RCurl_1.95-4.8 magrittr_1.5 modeltools_0.2-21
## [52] Matrix_1.2-10 Rcpp_0.12.11 munsell_0.4.3
## [55] ape_4.1 abind_1.4-5 viridis_0.4.0
## [58] stringi_1.1.5 whisker_0.3-2 yaml_2.1.14
## [61] MASS_7.3-47 zlibbioc_1.20.0 flexmix_2.3-14
## [64] MAST_1.0.5 plyr_1.8.4 grid_3.3.3
## [67] gdata_2.17.0 rncl_0.8.2 lattice_0.20-35
## [70] splines_3.3.3 uuid_0.1-2 taxize_0.8.4
## [73] fpc_2.1-10 rngtools_1.2.4 softImpute_1.4
## [76] reshape2_1.4.2 codetools_0.2-15 XML_3.98-1.7
## [79] evaluate_0.10 RNeXML_2.0.7 data.table_1.10.4
## [82] foreach_1.4.3 locfdr_1.1-8 tidyr_0.6.3
## [85] gtable_0.2.0 reshape_0.8.6 assertthat_0.2.0
## [88] kernlab_0.9-25 ggplot2_2.2.1 gridBase_0.4-7
## [91] phylobase_0.8.4 xtable_1.8-2 class_7.3-14
## [94] pcaPP_1.9-61 viridisLite_0.2.0 gsl_1.9-10.3
## [97] tibble_1.3.1 copula_0.999-16 iterators_1.0.8
## [100] registry_0.3 cluster_2.0.6